library(tidyverse)
library(lubridate)
library(tidyquant)
library(reticulate)
library(plotly)
# Import dataset --------------------------------------------------------------
data <- read_csv("data/ICD10_F43.2_patients.csv")
data_tib <- as_tibble(data)
# Data wrangling ---------------------------------------------------------------
# Recode sex to descriptive labels and derive birth year and age in one
# mutate() pipeline instead of the base-R round trip
# (as.character -> indexed replacement -> as.factor) plus two separate mutates.
data_tib <- data_tib %>%
mutate(
# "f"/"m" -> "Female"/"Male"; factor() yields the same alphabetical
# levels (Female, Male) as the original as.factor() call
Sex = factor(recode(as.character(Sex), f = "Female", m = "Male")),
Birth_year = year(`date-of-birth`),
# age at time of therapy — year granularity only, no month/day adjustment
age = `therapy-year` - Birth_year
) %>%
glimpse()
## Observations: 1,212
## Variables: 7
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ Sex <fct> Female, Male, Female, Female, Male, Female, Female...
## $ `date-of-birth` <date> 1989-11-02, 1972-01-29, 1968-10-05, 1982-08-24, 1...
## $ `total-sessions` <dbl> 11, 18, 20, 10, 31, 14, 13, 10, 34, 12, 30, 13, 15...
## $ `therapy-year` <dbl> 2020, 2019, 2020, 2020, 2008, 2020, 2020, 2020, 20...
## $ Birth_year <dbl> 1989, 1972, 1968, 1982, 1971, 2002, 1985, 2001, 19...
## $ age <dbl> 31, 47, 52, 38, 37, 18, 35, 19, 23, 23, 27, 55, 29...
# How many males / females in dataset?
# BUG FIX: the original filtered on the raw codes "m"/"f" AFTER Sex had been
# recoded to "Male"/"Female", so both filters returned 0 rows (the recorded
# output showed "Observations: 0"). Filter on the recoded labels, and use the
# bare column name inside filter() rather than data_tib$Sex — the $-qualified
# form bypasses dplyr's data masking and breaks under grouping.
males_tib <- data_tib %>%
filter(Sex == "Male") %>%
glimpse()
females_tib <- data_tib %>%
filter(Sex == "Female") %>%
glimpse()
# Visualise the dataset
# BUG FIX: labs() has no x_lab/y_lab arguments — the original labels were
# silently ignored and the axes kept their default names. Use x = / y =.
fig <- ggplot(data = data_tib, aes(x = age, y = `total-sessions`, color = Sex)) +
geom_point(alpha = 0.5) +
labs(
title = "Visualisation of data set",
subtitle = "Therapy sessions according to age and sex",
x = "Age",
y = "Sessions") +
theme_tq()
# Add interactivity
ggplotly(fig)
# Remove outliers
# Keep patients aged 13-70 (inclusive)
# Keep only rows where 4 < total-sessions <= 20 (the original comment said
# "Remove" for this range, but the filter below RETAINS it)
data_tib <- data_tib %>%
filter(age > 12, age < 71, `total-sessions`<= 20,`total-sessions`>4) %>%
glimpse()
## Observations: 1,108
## Variables: 7
## $ id <dbl> 1, 2, 3, 4, 6, 7, 8, 10, 12, 13, 14, 16, 17, 18, 1...
## $ Sex <fct> Female, Male, Female, Female, Female, Female, Fema...
## $ `date-of-birth` <date> 1989-11-02, 1972-01-29, 1968-10-05, 1982-08-24, 2...
## $ `total-sessions` <dbl> 11, 18, 20, 10, 14, 13, 10, 12, 13, 15, 12, 14, 20...
## $ `therapy-year` <dbl> 2020, 2019, 2020, 2020, 2020, 2020, 2020, 2019, 20...
## $ Birth_year <dbl> 1989, 1972, 1968, 1982, 2002, 1985, 2001, 1996, 19...
## $ age <dbl> 31, 47, 52, 38, 18, 35, 19, 23, 55, 29, 58, 57, 34...
# Check if outliers were removed: min/max of age and total-sessions in the
# summary should now sit inside the filter bounds
summary(data_tib)
## id Sex date-of-birth total-sessions
## Min. : 1.0 Female:744 Min. :1941-02-09 Min. : 5.00
## 1st Qu.: 303.8 Male :364 1st Qu.:1966-01-20 1st Qu.: 8.00
## Median : 604.5 Median :1976-02-18 Median :12.00
## Mean : 606.8 Mean :1976-03-26 Mean :11.64
## 3rd Qu.: 912.2 3rd Qu.:1986-12-31 3rd Qu.:14.00
## Max. :1212.0 Max. :2004-10-21 Max. :20.00
## therapy-year Birth_year age
## Min. :1997 Min. :1941 Min. :13.00
## 1st Qu.:2010 1st Qu.:1966 1st Qu.:27.00
## Median :2014 Median :1976 Median :36.00
## Mean :2013 Mean :1976 Mean :37.15
## 3rd Qu.:2017 3rd Qu.:1986 3rd Qu.:47.00
## Max. :2020 Max. :2004 Max. :68.00
# BUG FIX: labs() has no x_lab/y_lab arguments — use x = / y = so the axis
# labels actually render.
fig_no_outliers <- ggplot(data = data_tib, aes(x = age, y = `total-sessions`, color = Sex)) +
geom_point(alpha = 0.5) +
labs(
title = "Visualisation of data set",
subtitle = "Therapy sessions according to age and sex",
x = "Age",
y = "Sessions") +
theme_tq()
# Visualisation with outliers removed
fig_no_outliers

ggplotly(fig_no_outliers)
# Split the outlier-free data into male and female subsets
males_tib <- filter(data_tib, Sex == "Male")
glimpse(males_tib)
## Observations: 364
## Variables: 7
## $ id <dbl> 2, 12, 13, 22, 26, 28, 29, 31, 33, 34, 42, 43, 46,...
## $ Sex <fct> Male, Male, Male, Male, Male, Male, Male, Male, Ma...
## $ `date-of-birth` <date> 1972-01-29, 1965-05-31, 1990-06-24, 1968-06-07, 2...
## $ `total-sessions` <dbl> 18, 13, 15, 13, 12, 14, 8, 15, 13, 17, 6, 16, 10, ...
## $ `therapy-year` <dbl> 2019, 2020, 2019, 2020, 2019, 2017, 2019, 2019, 20...
## $ Birth_year <dbl> 1972, 1965, 1990, 1968, 2004, 1970, 1988, 1974, 19...
## $ age <dbl> 47, 55, 29, 52, 15, 47, 31, 45, 51, 46, 32, 40, 35...
females_tib <- filter(data_tib, Sex == "Female")
glimpse(females_tib)
## Observations: 744
## Variables: 7
## $ id <dbl> 1, 3, 4, 6, 7, 8, 10, 14, 16, 17, 18, 19, 20, 21, ...
## $ Sex <fct> Female, Female, Female, Female, Female, Female, Fe...
## $ `date-of-birth` <date> 1989-11-02, 1968-10-05, 1982-08-24, 2002-08-05, 1...
## $ `total-sessions` <dbl> 11, 20, 10, 14, 13, 10, 12, 12, 14, 20, 13, 13, 8,...
## $ `therapy-year` <dbl> 2020, 2020, 2020, 2020, 2020, 2020, 2019, 2020, 20...
## $ Birth_year <dbl> 1989, 1968, 1982, 2002, 1985, 2001, 1996, 1962, 19...
## $ age <dbl> 31, 52, 38, 18, 35, 19, 23, 58, 57, 34, 16, 52, 58...
# Separate each subset into a predictor column (age) and a label column
# (total-sessions) in preparation for the machine learning models
female_dataset <- select(females_tib, age)
glimpse(female_dataset)
## Observations: 744
## Variables: 1
## $ age <dbl> 31, 52, 38, 18, 35, 19, 23, 58, 57, 34, 16, 52, 58, 46, 21, 32,...
female_labels <- select(females_tib, `total-sessions`)
glimpse(female_labels)
## Observations: 744
## Variables: 1
## $ `total-sessions` <dbl> 11, 20, 10, 14, 13, 10, 12, 12, 14, 20, 13, 13, 8,...
males_dataset <- select(males_tib, age)
glimpse(males_dataset)
## Observations: 364
## Variables: 1
## $ age <dbl> 47, 55, 29, 52, 15, 47, 31, 45, 51, 46, 32, 40, 35, 30, 26, 20,...
males_labels <- select(males_tib, `total-sessions`)
glimpse(males_labels)
## Observations: 364
## Variables: 1
## $ `total-sessions` <dbl> 18, 13, 15, 13, 12, 14, 8, 15, 13, 17, 6, 16, 10, ...
library(tidyverse)
library(reticulate)
# set conda environment
# required = TRUE makes use_condaenv() error immediately if the "py3.8" env
# is missing, instead of silently falling back to another Python installation
use_condaenv("py3.8", required = TRUE)
# check if the correct environment is being used
# (py_config() also initializes the Python session on first call)
py_config()
## python: C:/Users/Susanna/Anaconda3/envs/py3.8/python.exe
## libpython: C:/Users/Susanna/Anaconda3/envs/py3.8/python38.dll
## pythonhome: C:/Users/Susanna/Anaconda3/envs/py3.8
## version: 3.8.3 (default, May 19 2020, 06:50:17) [MSC v.1916 64 bit (AMD64)]
## Architecture: 64bit
## numpy: C:/Users/Susanna/Anaconda3/envs/py3.8/Lib/site-packages/numpy
## numpy_version: 1.18.1
##
## NOTE: Python version was forced by use_python function
# is python working? (smoke test in the embedded Python session)
1+1
## 2
# XGBoost algorithm--------------------------------------------
# import python libraries here
# in conda terminal (with py3.8 env activated, install xgboost): pip install xgboost
# NOTE(review): the lines below are Python (a reticulate chunk in the knitted
# document) — they will not parse if pasted into a plain R session.
import xgboost as xgb
import sklearn
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
# Write the tibbles to .csv files so the Python session can read them back.
# FIX: the `path` argument of write_csv() is deprecated (readr >= 1.4); use
# `file`. write_csv() returns its input invisibly, so each assignment simply
# keeps an R-side copy of the tibble under the *_py name.
female_dataset_py <- write_csv(female_dataset,
file = "female_dataset_py.csv")
female_labels_py <- write_csv(female_labels,
file = "female_labels_py.csv")
males_dataset_py <- write_csv(males_dataset,
file = "males_dataset_py.csv")
males_labels_py <- write_csv(males_labels,
file = "males_labels_py.csv")
# Import the created .csv files
# Females
# NOTE(review): hard-coded absolute Windows paths make this non-portable —
# consider reading relative to the project directory instead.
data_female = pd.read_csv(r'C:/Users/Sanmari Vivier/Dropbox/Personal/r-shiny-app/r-markdown-files/female_dataset_py.csv')
data_female_labels = pd.read_csv(r'C:/Users/Sanmari Vivier/Dropbox/Personal/r-shiny-app/r-markdown-files/female_labels_py.csv')
# FIX: pd.read_csv() already returns a DataFrame — the original's extra
# pd.DataFrame(...) wrapping was a redundant copy. Keep both names so later
# code that references either still works.
female_dataset_py = data_female
female_labels_py = data_female_labels
# Check to see if it is working
print (female_dataset_py.head(), female_labels_py.head())
## age
## 0 31
## 1 52
## 2 38
## 3 18
## 4 35 total-sessions
## 0 11
## 1 20
## 2 10
## 3 14
## 4 13
# Import the created .csv files----------------------------------
# Males
# NOTE(review): hard-coded absolute Windows paths make this non-portable —
# consider reading relative to the project directory instead.
data_male = pd.read_csv(r'C:/Users/Sanmari Vivier/Dropbox/Personal/r-shiny-app/r-markdown-files/males_dataset_py.csv')
data_male_labels = pd.read_csv(r'C:/Users/Sanmari Vivier/Dropbox/Personal/r-shiny-app/r-markdown-files/males_labels_py.csv')
# FIX: pd.read_csv() already returns a DataFrame — the original's extra
# pd.DataFrame(...) wrapping was a redundant copy. Keep both names so later
# code that references either still works.
male_dataset_py = data_male
male_labels_py = data_male_labels
# Check to see if it is working
print (male_dataset_py.head(), male_labels_py.head())
## age
## 0 47
## 1 55
## 2 29
## 3 52
## 4 15 total-sessions
## 0 18
## 1 13
## 2 15
## 3 13
## 4 12
# Split up the data into training and testing sets
# FIX: the original split features and labels in two SEPARATE
# train_test_split() calls; rows only stayed aligned by the coincidence that
# both calls used the same random_state on equal-length inputs. Passing the
# feature and label frames together guarantees alignment and — because the
# same random_state drives one permutation — produces exactly the same
# partition as before.
# Females
females_data_train, females_data_test, females_labels_train, females_labels_test = train_test_split(
    female_dataset_py,
    female_labels_py,
    test_size=0.33,
    random_state=42)
# Males
males_data_train, males_data_test, males_labels_train, males_labels_test = train_test_split(
    male_dataset_py,
    male_labels_py,
    test_size=0.33,
    random_state=42)
# Check if the data is split
print(males_labels_train)
## total-sessions
## 118 9
## 31 13
## 36 13
## 284 6
## 181 12
## .. ...
## 71 9
## 106 6
## 270 12
## 348 7
## 102 6
##
## [243 rows x 1 columns]
# Model 1: XGBoost Classifier algorithm ----------------------------------------
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
# Flatten the (n, 1) label frames to 1-D arrays, as sklearn/xgboost expect
females_labels_train = np.ravel(females_labels_train)
females_labels_test = np.ravel(females_labels_test)
males_labels_train = np.ravel(males_labels_train)
males_labels_test = np.ravel(males_labels_test)
# Shared hyper-parameters.
# NOTE(review): the recorded output showed xgboost overriding
# objective='binary:logistic' with multi:softprob because the session counts
# have many distinct classes — treating a count target as classification is
# questionable; compare with the regressor in Model 2.
xgb_params = dict(max_depth=6,
                  eta=0.01,
                  gamma=4,
                  min_child_weight=6,
                  subsample=0.8,
                  #silent=0,
                  objective='binary:logistic',
                  n_estimators=5,
                  seed=1729
                  )
# BUG FIX: the original did `model_xgb_f = model; model_xgb_m = model`, which
# aliased ONE estimator object. Fitting the male model then overwrote the
# female fit, so the "female" predictions actually came from the male-trained
# model. Build three independent estimators instead (`model` is kept for
# backward compatibility with the original names).
model = XGBClassifier(**xgb_params)
model_xgb_f = XGBClassifier(**xgb_params)
model_xgb_m = XGBClassifier(**xgb_params)
# Train the models — one per sex, each fitted exactly once
model_xgb_f.fit(females_data_train, females_labels_train)
model_xgb_m.fit(males_data_train, males_labels_train)
# Test models
# NOTE: the previously recorded accuracies (10.163 % / 11.570 %) are invalid
# for females because of the aliasing bug above; rerun to refresh them.
pred_xgb_f = model_xgb_f.predict(females_data_test)
pred_xgb_m = model_xgb_m.predict(males_data_test)
xgb_acc_f = accuracy_score(females_labels_test, pred_xgb_f)
xgb_acc_m = accuracy_score(males_labels_test, pred_xgb_m)
print("\n")
print("XGB model accuracy for females: {:.3f} %\n".format(xgb_acc_f * 100))
print("XGB model accuracy for males: {:.3f} %\n".format(xgb_acc_m * 100))
# Model 2: XGBoost Regressor algorithm -----------------------------------------
# Model 2: XGB Regressor
# Here we will be calculating the loss using RMSE (root-mean-square-error)
from sklearn.metrics import mean_squared_error
xgbr_params = dict(objective='reg:squarederror',
                   n_estimators=12,
                   seed=123,
                   learning_rate=0.45
                   )
# BUG FIX: `model_xgbR_f = model2; model_xgbR_m = model2` aliased one
# estimator, so the male fit overwrote the female fit and the female RMSE was
# computed with the male-trained model. Use independent estimators
# (`model2` kept for backward compatibility).
model2 = xgb.XGBRegressor(**xgbr_params)
model_xgbR_f = xgb.XGBRegressor(**xgbr_params)
model_xgbR_m = xgb.XGBRegressor(**xgbr_params)
# Train the models
model_xgbR_f.fit(females_data_train, females_labels_train, eval_metric='rmse')
model_xgbR_m.fit(males_data_train, males_labels_train, eval_metric='rmse')
# Test models
# NOTE: the previously recorded RMSEs (4.222 / 3.740) predate this fix —
# rerun to refresh the female figure.
pred_xgbR_f = model_xgbR_f.predict(females_data_test)
pred_xgbR_m = model_xgbR_m.predict(males_data_test)
rmse_f = np.sqrt(mean_squared_error(females_labels_test, pred_xgbR_f))
rmse_m = np.sqrt(mean_squared_error(males_labels_test, pred_xgbR_m))
print("\n")
print("RMSE for females: {:.3f} \n".format(rmse_f))
print("RMSE for males: {:.3f} \n".format(rmse_m))
# Model 3: Naive Bayes algorithm -----------------------------------------------
from sklearn.naive_bayes import GaussianNB
# Change the data shapes for the labels using np.ravel()
# (idempotent if Model 1 already flattened them)
# Train set
females_labels_train_nb = np.ravel(females_labels_train)
males_labels_train_nb = np.ravel(males_labels_train)
# Test set
females_labels_test_nb = np.ravel(females_labels_test)
males_labels_test_nb = np.ravel(males_labels_test)
# BUG FIX: `model_nb_f = model3; model_nb_m = model3` aliased one estimator,
# so the male fit overwrote the female fit. Use independent estimators
# (`model3` kept for backward compatibility). 1e-09 is GaussianNB's default
# var_smoothing, spelled explicitly.
model3 = GaussianNB(var_smoothing=1e-09)
model_nb_f = GaussianNB(var_smoothing=1e-09)
model_nb_m = GaussianNB(var_smoothing=1e-09)
# Train the models
model_nb_f.fit(females_data_train, females_labels_train_nb)
model_nb_m.fit(males_data_train, males_labels_train_nb)
# Test models — the recorded accuracies predate this fix; rerun to refresh
pred_nb_f = model_nb_f.predict(females_data_test)
pred_nb_m = model_nb_m.predict(males_data_test)
nb_acc_f = accuracy_score(females_labels_test_nb, pred_nb_f)
nb_acc_m = accuracy_score(males_labels_test_nb, pred_nb_m)
print("\n")
print("NB model accuracy for females: {:.3f} %\n".format(nb_acc_f * 100))
print("NB model accuracy for males: {:.3f} %\n".format(nb_acc_m * 100))
# Model 4: Support Vector Machine algorithm ------------------------------------
#from sklearn.svm import SVC
from sklearn.svm import LinearSVC
# Change the data shapes for the labels using np.ravel()
# (idempotent if earlier models already flattened them)
# Train set
females_labels_train_svm = np.ravel(females_labels_train)
males_labels_train_svm = np.ravel(males_labels_train)
# Test set
females_labels_test_svm = np.ravel(females_labels_test)
males_labels_test_svm = np.ravel(males_labels_test)
# BUG FIX: `model_svm_f = model4; model_svm_m = model4` aliased one estimator,
# so the male fit overwrote the female fit. Use independent estimators
# (`model4` kept for backward compatibility).
svm_params = dict(C=1000,
                  max_iter=1000000,
                  dual=False
                  )
model4 = LinearSVC(**svm_params)
model_svm_f = LinearSVC(**svm_params)
model_svm_m = LinearSVC(**svm_params)
# Train the models
model_svm_f.fit(females_data_train, females_labels_train_svm)
model_svm_m.fit(males_data_train, males_labels_train_svm)
# Test models — the recorded accuracies predate this fix; rerun to refresh
pred_svm_f = model_svm_f.predict(females_data_test)
pred_svm_m = model_svm_m.predict(males_data_test)
svm_acc_f = accuracy_score(females_labels_test_svm, pred_svm_f)
svm_acc_m = accuracy_score(males_labels_test_svm, pred_svm_m)
print("\n")
print("SVM model accuracy for females: {:.3f} %\n".format(svm_acc_f * 100))
print("SVM model accuracy for males: {:.3f} %\n".format(svm_acc_m * 100))
# Model 5: Logistic Regression algorithm ---------------------------------------
from sklearn.linear_model import LogisticRegression
# BUG FIX: `model_lr_f = model5; model_lr_m = model5` aliased one estimator,
# so the male fit overwrote the female fit. Use independent estimators
# (`model5` kept for backward compatibility).
lr_params = dict(max_iter=100000,
                 C=1,
                 solver='saga',
                 dual=False
                 )
model5 = LogisticRegression(**lr_params)
model_lr_f = LogisticRegression(**lr_params)
model_lr_m = LogisticRegression(**lr_params)
# Train the models — reuses the flattened *_nb label arrays from Model 3
model_lr_f.fit(females_data_train, females_labels_train_nb)
model_lr_m.fit(males_data_train, males_labels_train_nb)
# Test models — the recorded accuracies predate this fix; rerun to refresh
pred_lr_f = model_lr_f.predict(females_data_test)
pred_lr_m = model_lr_m.predict(males_data_test)
lr_acc_f = accuracy_score(females_labels_test_nb, pred_lr_f)
lr_acc_m = accuracy_score(males_labels_test_nb, pred_lr_m)
print("\n")
print("LR model accuracy for females: {:.3f} %\n".format(lr_acc_f * 100))
print("LR model accuracy for males: {:.3f} %\n".format(lr_acc_m * 100))